# Package setup for the COMPAS violent-score analysis.
# The "##" lines below are console output captured when this document was
# rendered (package startup messages and name-masking notices).
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(xgboost)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.5
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ dplyr::combine()         masks randomForest::combine()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ ggplot2::margin()        masks randomForest::margin()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ dplyr::slice()           masks xgboost::slice()
## ✖ lubridate::union()       masks base::union()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(segmented)
# Project helpers; provides rmse(), fit_xgboost(), fit_svm() used below.
source('functions.r')

# Load precomputed tables (features, data_before, features_on, outcomes,
# current_offense_date_limit) produced by the table-construction step.
load("Table_construction.Rdata") 

# Assemble the modeling table: join demographics, engineered features, and
# outcomes, then derive capped counts, race indicators, subscales, filters.
features = features %>%
  # Add other useful information:
  inner_join(
    data_before %>% 
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
  # Drop the duplicate column created by the joins (keep the .x copy):
  select(-c(offenses_within_30.y)) %>%
  
  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Violence_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Violence_decile_score`, # Adjust for violent/general
    # Cap the count features so rare extreme values share a single bin:
    p_juv_fel_count = pmin(p_juv_fel_count,2),
    p_felprop_violarrest = pmin(p_felprop_violarrest,5),
    p_murder_arrest = pmin(p_murder_arrest,3),
    p_felassault_arrest = pmin(p_felassault_arrest,3),
    p_misdemassault_arrest = pmin(p_misdemassault_arrest,3),
    #p_famviol_arrest = pmin(p_famviol_arrest,3),
    p_sex_arrest = pmin(p_sex_arrest,3),
    p_weapons_arrest = pmin(p_weapons_arrest,3),
    p_n_on_probation = pmin(p_n_on_probation,5),
    p_current_on_probation = pmin(p_current_on_probation,5),
    p_prob_revoke = pmin(p_prob_revoke,5),
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0), # race == "Other" is the baseline
    
    # Subscales:
    vio_hist = p_juv_fel_count+
      p_felprop_violarrest+
      p_murder_arrest+
      p_felassault_arrest+
      p_misdemassault_arrest+
      #p_famviol_arrest+ #because no observations have nonzero for this
      p_sex_arrest+
      p_weapons_arrest,
    history_noncomp = p_prob_revoke+
      p_probation+p_current_on_probation+
      p_n_on_probation,
    
    # Filters (TRUE for observations to keep)
    # BUG FIX: the two decile-score conditions were previously separate
    # arguments to mutate(), so only the first was named filt1 and the
    # second produced an unnamed junk column. They are now combined with `&`
    # so filt1 requires both decile scores to be valid (not -1).
    filt1 = `Risk of Recidivism_decile_score` != -1 &
      `Risk of Violence_decile_score` != -1, 
    filt2 = offenses_within_30.x == 1,
    filt3 = !is.na(current_offense_date), 
    filt4 = current_offense_date <= current_offense_date_limit, 
    filt5 = p_current_age > 18 & p_current_age <= 65,  
    filt6 = vio_hist == 0 & history_noncomp==0
  )

Risk of Violence: we modelled age with a 4th-degree polynomial. See broward_sheriff_data for our efforts to model it with a variety of exponentials (we couldn't get it quite right, so we used a polynomial in the end). ## Fit age polynomial

# Observations eligible for the age lower bound: valid scores (filt1),
# age in (18, 65] (filt5), and no violence/noncompliance history (filt6).
features_age_poly = features %>%
  filter(filt1,filt5, filt6) 

# Lowest raw violence score observed at each age: the candidate
# lower-bound points the polynomial is fitted to.
lb_age = features_age_poly %>%
  group_by(p_current_age) %>%
  arrange(raw_score) %>%
  top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value


# Quartic polynomial in age fitted to the per-age minimum scores.
mdl_age = lm(raw_score ~ 
               I(p_current_age^4) + 
               I(p_current_age^3) + 
               I(p_current_age^2) + 
               p_current_age, 
             data=lb_age)

# More precision for paper
summary(mdl_age)
## 
## Call:
## lm(formula = raw_score ~ I(p_current_age^4) + I(p_current_age^3) + 
##     I(p_current_age^2) + p_current_age, data = lb_age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57803 -0.00688  0.00225  0.03572  0.13691 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         1.377e+00  1.007e+00   1.368  0.17505   
## I(p_current_age^4)  4.045e-07  4.368e-07   0.926  0.35718   
## I(p_current_age^3) -8.495e-05  7.258e-05  -1.170  0.24529   
## I(p_current_age^2)  7.052e-03  4.347e-03   1.622  0.10861   
## p_current_age      -3.010e-01  1.106e-01  -2.721  0.00798 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08883 on 81 degrees of freedom
## Multiple R-squared:  0.9793, Adjusted R-squared:  0.9783 
## F-statistic: 957.8 on 4 and 81 DF,  p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "1.37748225676627056302e+00"  "4.04470595978981737175e-07" 
## [3] "-8.49491450689363584601e-05" "7.05181080242555605869e-03" 
## [5] "-3.01011912451752128295e-01"
## Add f(age) to features and add filter
# age_poly is the fitted polynomial evaluated at each person's age;
# filt7 keeps points at or above the polynomial minus a 0.05 tolerance.
features = features %>%
  mutate(
    age_poly = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_poly = raw_score - age_poly,
    filt7 = raw_score >= age_poly - 0.05
  )

## Add same columns as above to lb_age 
lb_age = lb_age %>% 
    mutate(
    age_poly = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_poly = raw_score - age_poly,
    filt7 = raw_score >= age_poly - 0.05
    )

Plotting settings for age analysis

# Age axis limits for the plots, and a dense grid for drawing fitted curves.
xmin <- 18
xmax <- 65
xx <- seq(from = xmin, to = xmax, length.out = 1000)

Age polynomial plot

Generate a preliminary plot of age vs COMPAS violence score

# Lower-bound points colored by whether they sit above the fitted
# polynomial (filt7), with the polynomial overlaid.
ggplot()+
  geom_point(aes(x=p_current_age, raw_score, color = factor(filt7)),alpha=.3, data=lb_age) +
  geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  ylab("Violent score") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")

# All eligible observations (not only the per-age minima) against the
# same fitted polynomial.
ggplot()+
  geom_point(aes(x=p_current_age, raw_score), color="#619CFF",alpha=.3, data=features_age_poly) +
  geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  ylab("Violent score") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")

#Filter out outliers under the age polynomial, individuals with 
#noncomp history/violence history != 0
features_age_poly2 = features %>% 
    filter(filt1, filt5, filt6, filt7) 
    # filter(p_age_first_offense == p_current_age)

# Per-age minimum scores recomputed after removing below-polynomial outliers.
lb_filt = features_age_poly2 %>%
  group_by(p_current_age) %>%
    arrange(raw_score)%>%
    top_n(n=-1, wt = raw_score)

# One row per age: the minimum score and how many individuals attain it.
lb_values = lb_filt %>%  
    select(p_current_age, raw_score)%>%
    group_by(p_current_age) %>%
      summarize(raw_score = unique(raw_score), 
                n_inds = n()) %>%
    arrange(p_current_age)

#filtered out points 
# Points dropped between lb_age and lb_filt: they were excluded because
# they fall below the age polynomial (filt7 is FALSE).
lb_outliers = setdiff(lb_age, lb_filt) %>%
  filter(!filt7) %>%
  mutate(reason = "Below age polynomial") %>%
  select(p_current_age, p_age_first_offense, raw_score, reason)

# Combine the retained lower-bound points with the excluded outliers,
# labelled by reason, for the annotated plot below.
lb = lb_filt %>%
  select(p_current_age, p_age_first_offense, raw_score) %>%
  mutate(reason = "Used to construct linear splines") %>%
  rbind(lb_outliers)

Generating new age spline.

# Piecewise-linear (segmented) fit of score vs. age on the cleaned
# lower-bound points, with breakpoint starting values at ages 22, 36, 44.
mdl_age_spline <- segmented(lm(raw_score ~ p_current_age, data = lb_filt), 
                            seg.Z = ~p_current_age, psi = list(p_current_age = c(22,36,44)),
  control = seg.control(display = FALSE)
)

# Add filter 8: score at or above the spline minus a 0.05 tolerance.
features = features %>%
  mutate(
    age_spline = predict(mdl_age_spline, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_spline = raw_score - age_spline,
    filt8 = raw_score >= age_spline - 0.05
  )

Plotting individuals on the new bottom edge produced by fitting to lb_filt individuals.

# Estimated breakpoints of the segmented fit. Column 2 of $psi holds the
# breakpoint estimates; compute the summary once instead of three times.
psi_est = summary.segmented(mdl_age_spline)$psi
break1 = psi_est[1,2]
break2 = psi_est[2,2]
break3 = psi_est[3,2]


# Age grids spanning each linear segment, for drawing them separately.
xx1 = seq(xmin, break1, length.out = 500)
xx2 = seq(break1, break2, length.out = 500)
xx3 = seq(break2, break3, length.out = 500)
xx4 = seq(break3, xmax, length.out = 500)

# Lower-bound points (used vs. excluded) with the four spline segments
# drawn in distinct colors.
age_spline_viol = 
  ggplot()+
  geom_point(aes(x=p_current_age, raw_score, color= factor(reason)),alpha=.5, data=lb ) +
  scale_color_manual(values=c("red", "grey25")) + 
  geom_line(aes(x=xx1, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx1))),
                color="#F8766D", size = 1) +
  geom_line(aes(x=xx2, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx2))),
                color="darkorange1", size = 1) +
  geom_line(aes(x=xx3, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx3))),
                color="cyan3", size = 1) +
  geom_line(aes(x=xx4, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx4))),
                color="seagreen3", size = 1) +
  theme_bw()+
  xlim(xmin,xmax)+
  labs(
    x = "\n Age at COMPAS screening date", 
    y = "Violence Score \n"
  ) +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position = c(.95,.95),
        legend.title = element_blank(),
        # legend.key.width = unit(1.5, "cm"),
        legend.justification = c("right", "top"),
        legend.key = element_rect(fill = "aliceblue"),
        legend.background = element_rect(fill="aliceblue",
                                  size=0.5, linetype="solid")
        )
age_spline_viol

ggsave("Figures/age_LB_violent.pdf",width = 3.5, height = 2.5, units = "in")
# Age spline with all data (filters 1,5 still applied)
# Red points are below the age polynomial not age spline
ggplot()+
  geom_point(aes(x=p_current_age, raw_score), color="#F8766D", data=features %>% filter(filt1,filt5,!filt7)) + # Age outliers
  geom_point(aes(x=p_current_age, raw_score), color="#619CFF", alpha=.3, data=features %>% filter(filt1,filt5,filt7)) + # Not age outliers
  geom_line(aes(x=xx, predict(mdl_age_spline, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  # BUG FIX: this script analyzes the violence score (saved as
  # age_LB_alldata_violent.pdf); the label previously said "General score",
  # a copy-paste from the general-score script.
  ylab("Violent score") +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") 

ggsave("Figures/age_LB_alldata_violent.pdf",width = 3.5, height = 2.5, units = "in")

Fit history of violence lower bound

### Compute lower bound of raw_score__age_spline for each vio_hist value:

# Apply filters:
features_vio_hist = features %>%
  filter(filt1,filt3) %>% # Careful, filt8 applied later since we need these points for plot
  select(vio_hist, raw_score__age_spline,filt8)

# Compute lower bound: minimum age-adjusted score at each vio_hist value.
lb_vio_hist = features_vio_hist %>%
  filter(filt8) %>% # Now apply filt8
  select(-filt8) %>%
  group_by(vio_hist)%>%
  top_n(n=-1,wt=raw_score__age_spline)%>%
  rename(g_vio_hist = raw_score__age_spline) %>%
  #arrange(vio_hist,.by_group=TRUE) %>%
  arrange(vio_hist) %>%
  ungroup()

# Use last value of g_vio_hist if vio_hist > vio_hist_cutoff
vio_hist_cutoff = 6
lb_vio_hist_cutoff = lb_vio_hist$g_vio_hist[lb_vio_hist$vio_hist==vio_hist_cutoff]

# Flatten the bound beyond the cutoff (sparse data at high vio_hist).
lb_vio_hist = lb_vio_hist %>%
  mutate(g_vio_hist = if_else(vio_hist < vio_hist_cutoff, g_vio_hist, lb_vio_hist_cutoff))

# Add g(vio_hist) to features
features = features %>%
  left_join(lb_vio_hist, by="vio_hist") %>%
  mutate(raw_score__age_spline__g_vio_hist = raw_score__age_spline - g_vio_hist)
# Dense grid over vio_hist for drawing the step-like bound as a line.
lb_vio_hist_plot = 
  data.frame(vio_hist_xx = seq(0,13,length.out=10000)) %>%
  mutate(vio_hist = round(vio_hist_xx)) %>%
  left_join(lb_vio_hist,by="vio_hist")

ggplot() +
  geom_point(data=features_vio_hist,aes(x=vio_hist,y=raw_score__age_spline,color=filt8),alpha=.5) +
  geom_line(data=lb_vio_hist_plot,aes(x=vio_hist_xx,y=g_vio_hist,color="lb"))+
  theme_bw()+
  xlab("Sum of History of Violence Components") +
  ylab(expression(Violent~score~-~f[viol~age])) + 
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") +
  scale_color_manual(name=element_blank(),
                        breaks=c("TRUE", "FALSE","lb"),
                        labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age]),expression(g[viol~hist])),
                     values=c("TRUE"="#619CFF","FALSE"="#00BA38","lb"="#F8766D"))

rm(lb_vio_hist_plot)
ggsave("Figures/vio_hist_LB_violent.pdf",width = 5, height = 3, units = "in")

Examine history of noncompliance lower bound (none found)

# Inspect the remaining residual against history of noncompliance; no
# usable lower bound was found here (hence no g(history_noncomp) term).
features %>%
  filter(filt1,filt3) %>% 
  select(history_noncomp, raw_score__age_spline__g_vio_hist, filt8) %>%
  ggplot() +
  geom_point(aes(x=history_noncomp,y=raw_score__age_spline__g_vio_hist,color=filt8),alpha=.5) +
  theme_bw()+
  xlab("Sum of History of Noncompliance Components") +
  ylab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist]))  +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") +
  scale_color_manual(name=element_blank(),
                       breaks=c("TRUE", "FALSE"),
                       labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age])),
                       values=c("TRUE"="#619CFF","FALSE"="#00BA38"))

ggsave("Figures/hist_noncomp_LB_violent.pdf",width = 5, height = 3, units = "in")

Predictions of (raw score - age spline) using several ML methods

There are a few groups of predictors we will use: * Group 1: without using age variables or race * Group 2: without using age variables but with race * Group 3: without using race but with age variables * Group 4: using age variables and race

#### Generic stuff (applies to all models)

## Filter rows
features_filt = features %>%
  filter(filt1, filt3) 

## Set parameters (each combination will be run)
# fit_xgboost() (from functions.r) grid-searches over every combination of
# the vector-valued entries below; 2*2*2*2 = 16 parameter sets.
param <- list(objective = "reg:linear",
              eval_metric = "rmse",
              eta = c(.05,.1),
              gamma = c(.5, 1), 
              max_depth = c(2,5),
              min_child_weight = c(5,10),
              subsample = c(1),
              colsample_bytree = c(1)
)

# svm
# fit_svm() grid-searches 3*3*3 = 27 combinations of these.
param_svm = list(
  type = 'eps-regression',
  cost = c(0.5,1,2),
  epsilon = c(0.5,1,1.5),
  gamma_scale = c(0.5,1,2)
)

# Training RMSE per predictor group (rows) and model family (columns).
res_rmse = data.frame(Group = 1:4, lm = NA, xgb = NA, rf = NA, svm = NA)

Group 1 models: predicting (raw score - age spline) without using age variables or race

### Create group 1 training data

## Select features and round count features
# Group 1: no age-at-screening variable and no race indicators.
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

# Group 1 linear model: age-adjusted violence score on all predictors.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0423 -0.3256 -0.1199  0.2067  3.7693 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.635535   0.009108  69.776  < 2e-16 ***
## p_age_first_offense    -0.002209   0.000287  -7.696 1.46e-14 ***
## p_juv_fel_count         0.128035   0.018591   6.887 5.86e-12 ***
## p_felprop_violarrest    0.117539   0.005655  20.785  < 2e-16 ***
## p_murder_arrest         0.117353   0.045875   2.558   0.0105 *  
## p_felassault_arrest     0.190687   0.010863  17.554  < 2e-16 ***
## p_misdemassault_arrest  0.183398   0.010030  18.285  < 2e-16 ***
## p_sex_arrest            0.182693   0.038842   4.703 2.57e-06 ***
## p_weapons_arrest        0.111707   0.014999   7.448 9.85e-14 ***
## p_n_on_probation        0.099579   0.004048  24.598  < 2e-16 ***
## p_current_on_probation  0.155303   0.019293   8.050 8.73e-16 ***
## p_prob_revoke           0.201229   0.015975  12.596  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4807 on 21296 degrees of freedom
## Multiple R-squared:  0.2281, Adjusted R-squared:  0.2277 
## F-statistic: 572.2 on 11 and 21296 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

# Group 1 xgboost: grid-search over `param`; printed block is the winner.
set.seed(46)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP

# Shared axis range so the identity line is at 45 degrees.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab("Violent score - f(age)") +
  ylab("XGBoost prediction")+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(55656)

# Group 1 random forest with default hyperparameters; the out-of-bag
# predictions (mdl_rf$predicted) feed the RMSE table.
mdl_rf = randomForest(raw_score__age_spline ~ ., data = train)

res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

# Group 1 SVM: grid-search over param_svm; printed block is the winner.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             10              
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.08333333"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

# Free the group 1 objects before building group 2.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 2 models: predicting (raw score - age spline) without using age variables but with race

### Create group 2 training data

## Select features and round count features
# Group 2: same predictors as group 1 plus the race indicator columns.
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__age_spline)


## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0551 -0.3131 -0.1100  0.2026  3.6229 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.4133605  0.0155902  26.514  < 2e-16 ***
## p_age_first_offense    -0.0003337  0.0002917  -1.144  0.25263    
## p_juv_fel_count         0.1186751  0.0182569   6.500 8.19e-11 ***
## p_felprop_violarrest    0.1136343  0.0055519  20.468  < 2e-16 ***
## p_murder_arrest         0.1206917  0.0450288   2.680  0.00736 ** 
## p_felassault_arrest     0.1862307  0.0106634  17.465  < 2e-16 ***
## p_misdemassault_arrest  0.1829102  0.0098450  18.579  < 2e-16 ***
## p_sex_arrest            0.1739793  0.0381234   4.564 5.06e-06 ***
## p_weapons_arrest        0.1016765  0.0147282   6.904 5.22e-12 ***
## p_n_on_probation        0.0942367  0.0039778  23.691  < 2e-16 ***
## p_current_on_probation  0.1483990  0.0189419   7.834 4.93e-15 ***
## p_prob_revoke           0.1936704  0.0156821  12.350  < 2e-16 ***
## race_black              0.2672590  0.0136012  19.650  < 2e-16 ***
## race_white              0.1243048  0.0136794   9.087  < 2e-16 ***
## race_hispanic           0.0326662  0.0162508   2.010  0.04443 *  
## race_asian             -0.0631597  0.0447106  -1.413  0.15778    
## race_native             0.1526455  0.0603705   2.528  0.01146 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4718 on 21291 degrees of freedom
## Multiple R-squared:  0.2567, Adjusted R-squared:  0.2561 
## F-statistic: 459.5 on 16 and 21291 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

set.seed(390)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP

# Shared axis range so the identity line is at 45 degrees.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab("Violent score - f(age)") +
  ylab("XGBoost prediction")+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(2728)

mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             2               
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "0.5"           
## gamma       "0.02941176"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

# Free the group 2 objects before building group 3.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 3 models: predicting (raw score - age spline) without using race but with age variables

### Create group 3 training data

## Select features and round count features
# Group 3: age variables included (p_current_age), race excluded.
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9315 -0.3150 -0.1150  0.1986  3.7819 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.5544851  0.0094229  58.845  < 2e-16 ***
## p_current_age           0.0184993  0.0006723  27.517  < 2e-16 ***
## p_age_first_offense    -0.0194415  0.0006868 -28.306  < 2e-16 ***
## p_juv_fel_count         0.1165548  0.0182743   6.378 1.83e-10 ***
## p_felprop_violarrest    0.0893107  0.0056510  15.804  < 2e-16 ***
## p_murder_arrest         0.0880189  0.0450946   1.952  0.05097 .  
## p_felassault_arrest     0.1478925  0.0107875  13.710  < 2e-16 ***
## p_misdemassault_arrest  0.1631086  0.0098843  16.502  < 2e-16 ***
## p_sex_arrest            0.1136560  0.0382527   2.971  0.00297 ** 
## p_weapons_arrest        0.0764478  0.0147949   5.167 2.40e-07 ***
## p_n_on_probation        0.0819168  0.0040296  20.329  < 2e-16 ***
## p_current_on_probation  0.1630972  0.0189614   8.602  < 2e-16 ***
## p_prob_revoke           0.0790314  0.0163148   4.844 1.28e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4724 on 21295 degrees of freedom
## Multiple R-squared:  0.2546, Adjusted R-squared:  0.2542 
## F-statistic: 606.3 on 12 and 21295 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

set.seed(34)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP

# Shared axis range so the identity line is at 45 degrees.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(Violent~score~-~f[viol~age])) +
  ylab("XGBoost prediction") +
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

ggsave("Figures/raw-fage_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(7872)

mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             3               
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "0.5"           
## gamma       "0.03846154"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

# Free the group 3 objects before building group 4.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 4 models: predicting (raw score - age spline) using age variables and race

### Create group 4 training data

## Select features and round count features
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9482 -0.3031 -0.1082  0.1942  3.6359 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.3466613  0.0155250  22.329  < 2e-16 ***
## p_current_age           0.0179295  0.0006617  27.094  < 2e-16 ***
## p_age_first_offense    -0.0170374  0.0006799 -25.057  < 2e-16 ***
## p_juv_fel_count         0.1076304  0.0179551   5.994 2.08e-09 ***
## p_felprop_violarrest    0.0864735  0.0055500  15.581  < 2e-16 ***
## p_murder_arrest         0.0912311  0.0442864   2.060  0.03941 *  
## p_felassault_arrest     0.1449762  0.0105944  13.684  < 2e-16 ***
## p_misdemassault_arrest  0.1631535  0.0097072  16.808  < 2e-16 ***
## p_sex_arrest            0.1073541  0.0375641   2.858  0.00427 ** 
## p_weapons_arrest        0.0674220  0.0145361   4.638 3.53e-06 ***
## p_n_on_probation        0.0772093  0.0039612  19.491  < 2e-16 ***
## p_current_on_probation  0.1559156  0.0186261   8.371  < 2e-16 ***
## p_prob_revoke           0.0757706  0.0160212   4.729 2.27e-06 ***
## race_black              0.2547357  0.0133809  19.037  < 2e-16 ***
## race_white              0.1076170  0.0134639   7.993 1.38e-15 ***
## race_hispanic           0.0327969  0.0159781   2.053  0.04012 *  
## race_asian             -0.0510765  0.0439625  -1.162  0.24532    
## race_native             0.1284837  0.0593640   2.164  0.03045 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4639 on 21290 degrees of freedom
## Multiple R-squared:  0.2815, Adjusted R-squared:  0.2809 
## F-statistic: 490.6 on 17 and 21290 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

set.seed(11)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(Violent~score~-~f[viol~age])) +
  ylab("XGBoost prediction")+
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12))

ggsave("Figures/raw-fage_xgboost_gp4_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

highlight = data.frame(
  person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
  screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
  highlight = TRUE
)

df_plot = features_filt %>%
  bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
  left_join(highlight, by = c("person_id","screening_date")) %>%
  mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
  mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))

# Hand-chosen label placement: each vector lists the highlighted person_ids
# whose name label is drawn to that side of its point (top-right, top-left,
# bottom-right, bottom-left) to avoid overlapping text in the plot below.
person_id_text_topright = c(8491, 8375, 1497)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(5722, 11231)
person_id_text_botright = c(799, 11312, 1284, 11414)
# Empty is fine: `person_id %in% c()` is all-FALSE, so that layer draws nothing.
person_id_text_botleft = c()

# Annotated scatter: XGBoost prediction (x) vs. actual violent score (y).
# Non-highlighted people form a translucent background cloud; Table-5
# individuals are drawn opaque on top and labelled by name. The four
# geom_text layers differ only in hjust/vjust so each label sits on the
# side of its point chosen in the person_id_text_* vectors above.
ggplot() +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  data = filter(df_plot, highlight=="In Table 5")) +
  theme_bw()+
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) + 
  xlab(expression(XGBoost~pred.~of~violent~score~-~f[age])) +
  ylab("Violent score")+
  # Legend suppressed; the colour coding is explained in the caption instead.
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12),
    #legend.position = "top",
    legend.position="none") +
  scale_color_discrete(name = element_blank())

Model 3: random forest

# Fixed seed so the random forest (bootstrap samples, variable subsets)
# is reproducible.
set.seed(379)

# Random forest with default hyperparameters on the same training frame.
mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

# Note: mdl_rf$predicted holds out-of-bag predictions (per the randomForest
# docs), so this RMSE is not a pure in-sample fit like the other models'.
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

# Tune and fit an SVM over the grid in `param_svm` (fit_svm is defined in
# functions.r; the printed output below shows the selected parameters).
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             12              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.05555556"
# Training RMSE from the fitted values of the selected SVM.
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Comparison

knitr::kable(res_rmse)
Group lm xgb rf svm
1 0.4805623 0.4644645 0.4727648 0.4712996
2 0.4715878 0.4532862 0.4609155 0.4626015
3 0.4722399 0.4503872 0.4605618 0.4618827
4 0.4636618 0.4394210 0.4470148 0.4475470

Predictions using xgboost only

We use the “Group 3” models but predict the raw score and the raw score minus all of the lower bounds we have fitted. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound only.

Predicting the raw score

### Create group 3 training data

## Select features and round count features
# "Group 3" predictor set: current age, age at first offense, and
# criminal-history count features, with the raw violent score as target.
# NOTE(review): the comment above mentions rounding count features, but no
# rounding is applied in this chunk — confirm it happened upstream.
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score)

## Format for xgboost
# xgboost needs numeric matrices: predictors as "data", target as "label".
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score) %>% as.matrix(),
  "label" = train %>% select(raw_score) %>% as.matrix()
)
# Fixed seed so the hyperparameter search / training is reproducible.
set.seed(347)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# In-sample fit: XGBoost prediction of the raw violent score vs. the
# observed score, with the identity line for reference.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score

print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.4506"
# Common axis limits over both series keep the plot square.
score_range = range(pred, actual)
axis_min = score_range[1]
axis_max = score_range[2]

fit_df = data.frame(xgboost = pred, compas = actual)

ggplot(fit_df) +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab("Violent score") +
  ylab("XGBoost prediction") +
  #annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14))

# ggsave() with no plot argument saves the last plot displayed above.
ggsave("Figures/raw_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")

Predicting the raw score - f(age) - g(vio_hist)

### Create group 3 training data

## Select features and round count features
# Same "Group 3" predictor set as above, but the target is the raw score
# with both the age spline f(age) and the violence-history term g(vio_hist)
# subtracted out (fitted earlier in the file).
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_juv_fel_count,
    p_felprop_violarrest,
    p_murder_arrest,
    p_felassault_arrest,
    p_misdemassault_arrest,
    #p_famviol_arrest,
    p_sex_arrest,
    p_weapons_arrest,
    p_n_on_probation,
    p_current_on_probation,
    p_prob_revoke,
    raw_score__age_spline__g_vio_hist)

## Format for xgboost
# Predictors as the "data" matrix, residual target as the "label".
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline__g_vio_hist) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline__g_vio_hist) %>% as.matrix()
)
# Fixed seed so the hyperparameter search / training is reproducible.
set.seed(7301)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  6           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# In-sample fit for the residual target (score minus f(age) and g(vio_hist)):
# prediction vs. observed, with shared square axis limits and identity line.
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__g_vio_hist

print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.4506"
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist])) +
  ylab("XGBoost prediction")+
  #annotate("text", x = 0.5, y = 4, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

# ggsave() with no plot argument saves the last plot displayed above.
ggsave("Figures/raw-fage-gviohist_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")

Replicating ProPublica logistic regression

# Replicate ProPublica's logistic regression of a binarized COMPAS violent
# risk label on demographics, charge info, and violent recidivism.
propub = features_filt %>%
  filter(filt4) %>% # Only people with valid recidivism values
  mutate(age_low = if_else(p_current_age < 25,1,0), 
         age_high = if_else(p_current_age > 45,1,0),
         female = if_else(sex=="Female",1,0),
         n_priors = p_felony_count_person + p_misdem_count_person,
         compas_high = if_else(`Risk of Violence_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
         race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis

print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 13505"
# Logistic regression for P(COMPAS labels Medium/High risk).
# `as.factor(race)` is redundant (race is already a releveled factor) but is
# kept so the coefficient names in the printed summary below stay the same.
mdl_glm = glm(compas_high ~
                female +
                age_high +
                age_low +
                as.factor(race) +
                p_charge +
                is_misdem +
                recid_violent,
              family=binomial(link='logit'), data=propub)

summary(mdl_glm)
## 
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) + 
##     p_charge + is_misdem + recid_violent, family = binomial(link = "logit"), 
##     data = propub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6043  -0.5313  -0.3066   0.7059   3.0149  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -2.614956   0.065330 -40.027  < 2e-16 ***
## female                          -0.584530   0.065149  -8.972  < 2e-16 ***
## age_high                        -1.576448   0.137479 -11.467  < 2e-16 ***
## age_low                          2.775651   0.054379  51.043  < 2e-16 ***
## as.factor(race)African-American  0.728572   0.055913  13.030  < 2e-16 ***
## as.factor(race)Asian            -0.851084   0.444399  -1.915   0.0555 .  
## as.factor(race)Hispanic          0.101528   0.094176   1.078   0.2810    
## as.factor(race)Native American   0.412826   0.478234   0.863   0.3880    
## as.factor(race)Other             0.019089   0.111656   0.171   0.8643    
## p_charge                         0.076164   0.003581  21.269  < 2e-16 ***
## is_misdem                       -0.418826   0.052150  -8.031 9.66e-16 ***
## recid_violent                    0.715798   0.066871  10.704  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16544  on 13504  degrees of freedom
## Residual deviance: 10743  on 13493  degrees of freedom
## AIC: 10767
## 
## Number of Fisher Scoring iterations: 6